Analysis.Ciona_TMTproC_normalization

Author

Andrea Mariossi

Published

October 25, 2023

1 TMTproC Ciona

We conducted a quantitative complement tandem mass tag (TMTpro) mass spectrometry (MS) analysis (Johnson et al., 2021) of the proteome of Ciona embryos across eight developmental stages, spanning unfertilized eggs, MZT, neurulation, tail elongation, and larval stage. Notably, the use of a multiplexed proteomics method allows for straightforward interpretation of low signal as low protein abundance, presenting a significant advantage over label-free measurements, where the absence of signal may be due to either low abundance or the mass spectrometer’s failure to detect the protein of interest (Pappireddi et al., 2019).

1.0.0.1 Loading libraries

1.0.0.2 Functions and misc

rm(list=ls(all=TRUE))

###############################
#### Directories
###############################

directory <-  c("/Users/mariossi/Library/CloudStorage/Dropbox-Princeton/Papers/Proteomics_ciona/")

directory_save  <-  c("/Users/mariossi/Library/CloudStorage/Dropbox-Princeton/Papers/Proteomics_ciona/c.figures_analysis/")

# directory <- file.choose(). # pick directory where the file is downloaded
raw_dir_proteome="b.raw_data/proteome/" 
ciona_stages <- c("unfE","fertE","cell16","iniG","latN","midTII","latTII","larva")
ciona_stages_beautify <- c("unfertilised","1-cell","16-cell","110-cell","late neurula","mid tailbud","late tailbud","larva")

###############################
#### Themes and colors
###############################

ciona_stages_palette <- c("#009392FF", "#39B185FF", "#9CCB86FF", "#E9E29CFF", "#EEB479FF", "#E88471FF", "#CF597EFF", "#A22E3B")

ciona_cluster_palette <- c("#88A0DCFF", "#381A61FF", "#7C4B73FF", "#ED968CFF", "#AB3329FF", "#E78429FF", "#F9D14AFF", "#7FA074FF") # > paletteer::paletteer_d("MetBrewer::Archambault") + MetBrewer::Cassatt2" - Colorblind-Friendly

###############################
#### Functions
###############################

# Parallelel plot
function_ggparallel <- function(data, col_names, group_col, plot_title, alpha_lines = 1, show_points = TRUE, plot_scale = "globalminmax", ...) {
  # Plot data
  plot <- ggparcoord(data = data,
                     columns = col_names,
                     groupColumn = group_col,
                     title = plot_title,
                     alphaLines = alpha_lines,
                     showPoints = show_points,
                     scale = plot_scale,
                     ...)
  plot <- plot + 
    theme_pubr(base_family = "Avenir") +
    labs(x="") +
    theme(legend.position = "bottom",
          legend.text = element_text(size=15)) # Remove legend
  return(plot)
}

function_plotGuides = guides(
  color = "none", 
  size = guide_legend(title="Proteins number",
                      title.theme = element_text(size = 15,face = "bold"),
  direction = "horizontal",
  title.position = "top",
  nrow=1)) 

# function for cluster plots
plot_cluster_norm <- function(df, title) {
  df %>%  
    pivot_longer(cols = c(1:8), names_to = "stage", values_to = "expression") %>%
    mutate(stage = str_remove(stage, "_byRowCol")) %>% 
    mutate(stage = fct_relevel(stage, ciona_stages), cluster = factor(cluster)) %>% 
    ggplot(aes(stage, expression, color = cluster)) +
    geom_point() +
    geom_line(aes(group = interaction(cluster, size), size = as.factor(size))) +
    scale_color_manual(values = ciona_cluster_palette) +
    theme_pubr(base_family = "Avenir") +
    theme(legend.position = "bottom", plot.title = element_text(size = 10)) +
    labs(title = title, x = "Developmental stages", y="Relative expression") +
    scale_size_discrete(name = "Cluster size") # rename the size legend
}

# function for cluster pivot
cluster_pivot_data <- function(data, cluster, cols) {
  data %>%
    filter(.cluster == cluster) %>%
    dplyr::select(all_of(cols)) %>%
    pivot_longer(-1) %>% 
    mutate(name = case_when(name == "unfE_byRowCol" ~ 1, name == "fertE_byRowCol" ~ 2, name == "cell16_byRowCol" ~ 3, name == "iniG_byRowCol" ~ 4, name == "latN_byRowCol" ~ 5, name == "midTII_byRowCol" ~ 6, name == "latTII_byRowCol" ~ 7, name == "larva_byRowCol" ~ 8))
}

# function to linear regression
plot_cluster_lr <- function(data, title, norm, tag) {
  ggplot(data, aes(x = name, y = value)) +
    geom_point() +
    ylim(0, 8) +
    theme_pubr(base_family = "Avenir") +
    geom_smooth(method = "lm", se = FALSE) +
    labs(title = title,
         subtitle = paste(
           "Adj R^2 = ", signif(summary(lm(value ~ name, data = data))$adj.r.squared, 5), 
           "Intercept = ", signif(coef(lm(value ~ name, data = data))[1], 5),
           "Slope = ", signif(coef(lm(value ~ name, data = data))[2], 5), size = 5),
         tag= tag,
         y="Relative expression",
         x="Stage") +
    if(norm) { scale_y_continuous(labels = function(x) format(x, scientific = FALSE, big.mark = ","))}
}

# check HGNCmitocarta results with plot: individual proteins
plot_HGNCmitocarta_dynamics <- function(from, to) {
  ciona_rowMean_mitocartaHGNC_flatDynamics.df %>%
    rename_with(.fn = ~str_replace(., "_by_rowMean$", ""), .cols = ends_with("_by_rowMean")) %>%
    ungroup() %>%
    slice(from:to) %>%
    pivot_longer(cols = c(4:11),
                 names_to = "stage", 
                 values_to = "expression") %>%
    mutate(stage = fct_relevel(stage, ciona_stages)) %>%
    ggplot(aes(stage, expression, color = Protein_Id_transcript)) +
    geom_point(aes(group = Protein_Id_transcript)) +
    geom_line(aes(group = Protein_Id_transcript)) +
    facet_wrap(~Protein_Id_transcript) +
    theme_pubr(legend = "none",base_family = "Avenir") +
    ylim(0,6)  
}

1.0.0.3 Loading data

###############################
#### Loading the data
###############################

ciona_TMTproC_prenormal.df <- 
  as_tibble(
  read_tsv(file = paste0(directory, raw_dir_proteome, "ky21_w_oxmet_protein_quant_pre.normalisation.tsv"), col_names = T))

# cleaning up the names
lookup <- c("unfE" = "TMTPro126", "fertE" = "TMTPro128C", "cell16" = "TMTPro129N", "iniG" = "TMTPro130C", "latN" = "TMTPro131N", "midTII" = "TMTPro131C", "latTII" = "TMTPro133C", "larva" = "TMTPro134N")

ciona_TMTproC_prenormal.df <-
  ciona_TMTproC_prenormal.df %>%
  rename(all_of(lookup)) %>% # Rename the column
  filter(!grepl("_contaminant", Protein_Id, ignore.case = TRUE)) %>% # exclude human/pig/TEV contaminant, 42 entries
  filter(!grepl("##", Protein_Id, ignore.case = TRUE)) %>% # exclude scramble input for FDR correction
  dplyr::select(-c("Gene_Symbol", "Description", "Group_ID", "Collapsed", "Parsimony", "default~ratios0 sum", "default~ratios9 sum")) %>%
  mutate(Protein_Id_Gene = gsub(pattern = "\\.v\\S*\\..*", replacement = "", Protein_Id)) %>% # transcript to gene ID
  rename(Protein_Id_transcript = Protein_Id) %>%
  arrange(Protein_Id_transcript) %>%
  relocate(Protein_Id_transcript, Protein_Id_Gene, everything())

glimpse(ciona_TMTproC_prenormal.df)
Rows: 7,095
Columns: 11
$ Protein_Id_transcript <chr> "KY21.Chr1.10.v1.nonSL2-1", "KY21.Chr1.1000.v1.SL1-1", "KY21.Chr1.1001.v1.SL1-1", "KY21.Chr1.1002.v1.SL1-1", "KY21.Chr1.1004.v1.nonSL4-1", "KY21.Chr1.1005.v1.ND1-1", "KY21.Chr1.1006.v1.SL1-1", "KY21.Chr1.1009.v1.ND1-1", "KY21.Chr1.101.v1.SL1-1", "KY21.Chr1.1010.v1.nonSL1-1", "KY21.Chr1.1011.v1.SL1-1", "KY21.Chr1.1012.v1.SL1-1", "KY21.Chr1.1013.v1.ND1-1", "KY21.Chr1.1014.v2.ND2-1", "KY21.Chr1.1015.v1.SL1-1", "KY21.Chr1.1016.v1.SL1-1", "KY21.Chr1.1018.v1.nonSL1-1", "KY21.Chr1.1019.v1.SL1-1", "KY21.Chr1.102.v1.nonSL2-1", "KY21.Chr1.1020.v1.SL1-1", "KY21.Chr1.1022.v1.ND1-1", "KY21.Chr1.1024.v1.SL1-1", "KY21.Chr1.1027.v2.nonSL3-1", "KY21.Chr1.1029.v1.SL1-1", "KY21.Chr1.1031.v1.ND1-1", "KY21.Chr1.1035.v1.SL1-1", "KY21.Chr1.1040.v1.nonSL7-1", "KY21.Chr1.1041.v1.nonSL2-1", "KY21.Chr1.1042.v1.nonSL4-1", "KY21.Chr1.1045.v1.SL1-1", "KY21.Chr1.1046.v1.SL1-1", "KY21.Chr1.1050.v1.SL1-1", "KY21.Chr1.1051.v1.SL1-1", "KY21.Chr1.1055.v1.SL1-1", "KY21.Chr1.1056.v1.nonSL3-1…
$ Protein_Id_Gene       <chr> "KY21.Chr1.10", "KY21.Chr1.1000", "KY21.Chr1.1001", "KY21.Chr1.1002", "KY21.Chr1.1004", "KY21.Chr1.1005", "KY21.Chr1.1006", "KY21.Chr1.1009", "KY21.Chr1.101", "KY21.Chr1.1010", "KY21.Chr1.1011", "KY21.Chr1.1012", "KY21.Chr1.1013", "KY21.Chr1.1014", "KY21.Chr1.1015", "KY21.Chr1.1016", "KY21.Chr1.1018", "KY21.Chr1.1019", "KY21.Chr1.102", "KY21.Chr1.1020", "KY21.Chr1.1022", "KY21.Chr1.1024", "KY21.Chr1.1027", "KY21.Chr1.1029", "KY21.Chr1.1031", "KY21.Chr1.1035", "KY21.Chr1.1040", "KY21.Chr1.1041", "KY21.Chr1.1042", "KY21.Chr1.1045", "KY21.Chr1.1046", "KY21.Chr1.1050", "KY21.Chr1.1051", "KY21.Chr1.1055", "KY21.Chr1.1056", "KY21.Chr1.1057", "KY21.Chr1.1058", "KY21.Chr1.1060", "KY21.Chr1.1061", "KY21.Chr1.1062", "KY21.Chr1.1063", "KY21.Chr1.1066", "KY21.Chr1.107", "KY21.Chr1.1072", "KY21.Chr1.1073", "KY21.Chr1.1077", "KY21.Chr1.1078", "KY21.Chr1.108", "KY21.Chr1.1085", "KY21.Chr1.1088", "KY21.Chr1.1089", "KY21.Chr1.109", "KY21.Chr1.1093", "KY21.Chr1.1095", "KY…
$ Number_of_peptides    <dbl> 5, 13, 15, 6, 21, 9, 1, 12, 9, 1, 2, 24, 5, 25, 1, 11, 3, 11, 2, 2, 2, 10, 7, 2, 3, 8, 2, 1, 8, 4, 8, 17, 6, 14, 3, 13, 1, 4, 23, 3, 5, 13, 9, 34, 7, 11, 1, 2, 2, 17, 12, 1, 12, 2, 39, 5, 6, 2, 4, 5, 5, 4, 7, 4, 30, 8, 2, 17, 7, 142, 23, 1, 1, 7, 6, 14, 15, 5, 5, 4, 13, 9, 4, 7, 10, 3, 17, 10, 10, 1, 5, 1, 6, 6, 54, 4, 14, 32, 5, 19, 3, 1, 8, 10, 1, 5, 10, 4, 2, 2, 25, 2, 5, 64, 3, 5, 7, 10, 1, 1, 2, 5, 9, 26, 6, 22, 5, 8, 2, 1, 4, 27, 1, 1, 33, 19, 1, 13, 11, 1, 4, 4, 2, 15, 2, 20, 8, 24, 3, 2, 3, 4, 24, 15, 5, 3, 1, 1, 6, 1, 3, 24, 19, 9, 1, 26, 7, 8, 5, 1, 42, 11, 1, 13, 2, 4, 29, 22, 3, 7, 6, 2, 1, 13, 19, 3, 28, 8, 1, 4, 15, 9, 3, 25, 11, 7, 4, 14, 2, 9, 6, 7, 8, 10, 3, 5, 6, 1, 6, 1, 3, 1, 20, 16, 28, 4, 2, 3, 11, 1, 18, 2, 2, 3, 6, 6, 9, 2, 10, 8, 4, 2, 1, 1, 4, 4, 3, 7, 1, 5, 24, 1, 5, 2, 23, 3, 2, 15, 18, 17, 3, 15, 2, 1, 18, 2, 4, 7, 4, 13, 9, 10, 1, 21, 2, 15, 8, 2, 2, 10, 8, 3, 4, 29, 10, 4, 10, 38, 3, 3, 4, 3, 12, 12, 4, 53, 15, 3, 4, 1, 8, …
$ unfE                  <dbl> 6.79550e-01, 1.80478e+00, 1.77061e+00, 2.66736e+00, 2.49546e+00, 1.18459e+00, 2.12080e-01, 2.15231e+00, 9.83251e-01, 2.93230e-01, 1.81664e-01, 2.67143e+00, 6.81070e-01, 3.21492e+00, 1.40200e-01, 1.19374e+00, 3.10652e-01, 1.16788e+00, 8.01760e-02, 3.84460e-01, 1.16219e-01, 1.41303e+00, 7.58105e-01, 2.03191e-01, 3.69990e-01, 6.47148e-01, 1.62202e-01, 2.97860e-01, 1.16880e+00, 3.64094e+00, 4.94889e-01, 2.19138e+00, 5.69127e-01, 1.96017e+00, 4.12340e-01, 2.42170e+00, 1.33990e-01, 3.06282e-01, 2.02815e+00, 3.20270e-01, 8.49990e-02, 1.45603e+00, 9.73567e-01, 4.39201e+00, 3.04738e-01, 1.29174e+00, 8.45240e-01, 9.60280e-02, 2.95710e-01, 1.86714e+00, 1.45689e+00, 8.31570e-03, 1.00654e+00, 1.60516e-01, 4.50932e+00, 2.96806e-01, 7.19776e-01, 2.56220e-01, 1.56429e-01, 6.51930e-01, 6.41110e-01, 5.46610e-01, 8.60690e-01, 2.85782e-01, 3.97738e+00, 8.18631e-01, 4.01320e-01, 2.18791e+00, 6.97508e-01, 1.92738e+01, 5.85748e-01, 1.95630e-01, 2.70560e-02, 2.27420e+00, 6.7773…
$ fertE                 <dbl> 0.57898000, 1.61365000, 1.47541000, 2.22937000, 2.12542000, 1.04231000, 0.16163000, 1.79585000, 0.83099500, 0.28886000, 0.15184300, 2.39866000, 0.51786800, 2.79024000, 0.11953000, 1.07551000, 0.34142300, 0.99176000, 0.06759600, 0.24660000, 0.10924900, 1.32116000, 0.65158100, 0.15684400, 0.29204400, 0.51322100, 0.15263800, 0.07427600, 0.95676400, 0.22614100, 0.54806600, 1.91668000, 0.33684600, 1.65419000, 0.33958000, 2.13430000, 0.10245000, 0.26434300, 1.88775000, 0.29613000, 0.06304520, 1.34814000, 0.89273200, 3.75227000, 0.26859700, 1.10863000, 0.05636300, 0.10972500, 0.22016300, 1.75429000, 1.14763000, 0.14042000, 0.80252200, 0.14261900, 3.81532000, 0.24705600, 0.64934400, 0.22233000, 0.28445900, 0.57858800, 0.58369100, 0.47278000, 0.77612000, 0.30220000, 3.58440000, 0.76841400, 0.20288900, 1.84208000, 0.81967000, 16.18490000, 0.62608200, 0.58697000, 0.10912000, 0.77289200, 0.58463500, 1.66089000, 1.53299000, 0.51176500, 0.37952900, 0.43241300, 1.30337…
$ cell16                <dbl> 6.30810e-01, 1.74379e+00, 1.84235e+00, 3.92345e-01, 2.61606e+00, 1.54636e+00, 2.81570e-02, 2.01416e+00, 9.19726e-01, 4.99240e-02, 1.96029e-01, 2.64079e+00, 5.92140e-01, 3.23076e+00, 1.21340e-01, 1.34959e+00, 4.17450e-01, 1.27442e+00, 1.59875e-01, 2.46230e-01, 2.00173e-01, 1.22588e+00, 8.85627e-01, 2.39350e-01, 3.57280e-01, 8.26071e-01, 1.94077e-01, 9.31880e-02, 1.02497e+00, 4.03112e-02, 1.14933e+00, 2.36967e+00, 6.11220e-01, 1.99817e+00, 4.15720e-01, 1.85064e+00, 1.27860e-01, 3.48907e-01, 2.67077e+00, 3.12358e-01, 5.07407e-02, 1.58694e+00, 1.20904e+00, 4.44341e+00, 3.90184e-01, 1.31081e+00, 4.67240e-04, 1.69883e-01, 2.13952e-01, 2.15117e+00, 1.54993e+00, 5.47300e-02, 1.12241e+00, 1.93090e-01, 4.62140e+00, 3.23287e-01, 7.08080e-01, 2.98280e-01, 2.00552e-01, 6.54220e-01, 6.74670e-01, 6.17570e-01, 9.20190e-01, 4.96919e-01, 3.99285e+00, 9.98534e-01, 4.39920e-01, 2.23781e+00, 8.37260e-01, 1.84059e+01, 1.26432e+00, 7.63360e-02, 7.70780e-02, 1.11694e+00, 7.0421…
$ iniG                  <dbl> 5.66530e-01, 1.40743e+00, 1.66590e+00, 8.85543e-02, 2.41560e+00, 1.25660e+00, 2.28600e-02, 1.09708e+00, 1.04635e+00, 4.16140e-02, 2.22320e-01, 2.66642e+00, 4.98090e-01, 3.00702e+00, 1.45100e-01, 1.23986e+00, 3.92330e-01, 1.35977e+00, 1.82828e-01, 1.73323e-01, 3.41980e-01, 6.05508e-01, 7.58633e-01, 2.23890e-01, 2.58228e-01, 9.31983e-01, 2.46000e-01, 4.29070e-02, 8.93905e-01, 3.44082e-03, 9.58860e-01, 1.84328e+00, 5.95189e-01, 1.82242e+00, 3.33750e-01, 1.41622e+00, 1.31940e-01, 4.51420e-01, 2.26438e+00, 3.32360e-01, 4.50053e-02, 1.68475e+00, 1.10516e+00, 3.85193e+00, 5.11094e-01, 1.07302e+00, 5.28620e-04, 2.51760e-01, 1.86955e-01, 2.08841e+00, 1.48001e+00, 1.38720e-01, 1.00068e+00, 2.70140e-01, 4.35132e+00, 8.75432e-02, 6.27003e-01, 2.27230e-01, 1.79769e-01, 5.23063e-01, 5.63302e-01, 6.11400e-01, 7.82620e-01, 6.78410e-01, 3.64809e+00, 1.19590e+00, 1.70195e-01, 1.59778e+00, 7.69533e-01, 1.61785e+01, 1.57272e+00, 6.54920e-02, 1.65220e-01, 1.15200e+00, 6.7515…
$ latN                  <dbl> 6.56020e-01, 1.60770e+00, 1.92901e+00, 1.48615e-01, 2.84228e+00, 1.15077e+00, 1.41600e-01, 1.24657e+00, 1.23647e+00, 5.97930e-02, 2.87670e-01, 3.11005e+00, 6.75230e-01, 3.52794e+00, 1.33690e-01, 1.46253e+00, 3.92100e-01, 1.55889e+00, 2.93490e-01, 2.01361e-01, 4.45370e-01, 8.74752e-01, 8.09143e-01, 2.62910e-01, 4.10610e-01, 1.09934e+00, 2.50590e-01, 1.83880e-02, 1.02141e+00, 2.00058e-03, 1.26086e+00, 2.12950e+00, 7.99010e-01, 2.13169e+00, 4.19640e-01, 1.45474e+00, 1.41710e-01, 5.24710e-01, 3.08728e+00, 4.23560e-01, 1.46835e+00, 1.96051e+00, 1.19747e+00, 4.43640e+00, 1.06858e+00, 1.43388e+00, 5.32300e-04, 3.27810e-01, 1.89183e-01, 2.30298e+00, 1.59574e+00, 7.38420e-02, 1.53803e+00, 3.23490e-01, 5.04323e+00, 1.04762e-01, 8.44640e-01, 2.39760e-01, 1.57163e-01, 6.84430e-01, 6.94280e-01, 6.07570e-01, 8.23100e-01, 6.68700e-01, 4.15992e+00, 1.31243e+00, 1.50844e-01, 1.99327e+00, 8.42364e-01, 1.75978e+01, 2.85044e+00, 5.03780e-02, 1.42170e-01, 9.48280e-01, 8.2858…
$ midTII                <dbl> 0.51649100, 1.47614000, 1.86640000, 0.14455300, 2.73108000, 1.00493000, 0.13914000, 1.19404000, 1.15203000, 0.05498100, 0.26129000, 3.13778000, 0.61660000, 3.08246000, 0.11558000, 1.39207000, 0.38579000, 1.39137000, 0.35927000, 0.27412000, 0.30976000, 1.06056000, 0.82864000, 0.28930000, 0.39583000, 1.10481000, 0.23575000, 0.06784000, 0.88612100, 0.00320937, 1.12370000, 1.84402000, 0.79298300, 1.58788000, 0.33307000, 1.19009000, 0.12321000, 0.59921000, 3.08591000, 0.38369000, 1.07596000, 1.66018000, 1.16994000, 4.11350000, 1.30775000, 1.49254000, 0.00027092, 0.31506000, 0.21978600, 2.24189000, 1.41341000, 0.10162000, 1.78241000, 0.34303000, 5.25463000, 0.33050300, 0.75703000, 0.20418100, 0.29036500, 0.60226000, 0.58384600, 0.43159000, 0.67521200, 0.61858000, 3.64306000, 1.22484000, 0.19632100, 1.80701000, 0.90437000, 16.52180000, 4.15660000, 0.02449100, 0.14110000, 0.37511500, 0.77145000, 1.70249000, 1.65501000, 0.58259000, 0.79794000, 0.46438000, 1.61223…
$ latTII                <dbl> 0.71847000, 1.81019000, 2.29295000, 0.19764400, 2.99856000, 0.95946700, 0.15448000, 1.39381000, 1.50655000, 0.13078000, 0.36377000, 3.98064000, 0.73932000, 3.19899000, 0.11578000, 1.73527000, 0.39115000, 1.74421000, 0.40071000, 0.24575000, 0.28959000, 1.66706000, 1.11664000, 0.32713000, 0.45108000, 1.43025000, 0.41443000, 0.13884000, 1.06728000, 0.00861615, 1.43945000, 2.44021000, 0.94058300, 1.60715000, 0.38654000, 1.46237000, 0.11509000, 0.70846000, 4.22869000, 0.49455000, 1.10235000, 1.72177000, 1.28422000, 4.71681000, 2.00193000, 1.79324000, 0.04356600, 0.38657000, 0.34050000, 2.29164000, 1.71192000, 0.24534000, 2.34237000, 0.31987000, 5.60966000, 1.32448000, 0.86305000, 0.26963000, 1.01290000, 0.65483000, 0.65851000, 0.40867400, 1.10525000, 0.51764000, 3.90885000, 1.04650000, 0.26919000, 2.76587000, 1.04617000, 20.00400000, 5.74928000, 0.00034559, 0.19598000, 0.22023100, 0.89548000, 1.21065000, 2.35680000, 0.76510000, 0.65684500, 0.51234000, 1.74198…
$ larva                 <dbl> 0.65318000, 1.53632000, 2.15739000, 0.13155000, 2.77555000, 0.85500400, 0.14004000, 1.10616000, 1.32461000, 0.08081900, 0.33542000, 3.39425000, 0.67965500, 2.94764000, 0.10878000, 1.55143000, 0.36908900, 1.51174000, 0.45606000, 0.22818000, 0.18767700, 1.83204000, 1.19164000, 0.29738000, 0.46495000, 1.44716000, 0.34432000, 0.26671000, 0.98072000, 0.07533700, 1.02489000, 2.26527000, 1.35504000, 1.23834000, 0.35935000, 1.06997000, 0.12374000, 0.79668000, 3.74706000, 0.43708000, 1.10956000, 1.58167000, 1.16787000, 4.29374000, 1.14713000, 1.49614000, 0.05302900, 0.34318000, 0.33375000, 2.30244000, 1.64451000, 0.23702000, 2.40503000, 0.24724000, 5.79510000, 2.28556000, 0.83109000, 0.28237000, 1.71836000, 0.65069000, 0.60056000, 0.30379800, 1.05683000, 0.43174600, 3.08550000, 0.63474200, 0.16932400, 2.56829000, 1.08313000, 17.83330000, 6.19484000, 0.00035054, 0.14228000, 0.14032400, 0.86275000, 0.81396900, 1.99099000, 0.64302500, 0.61435000, 0.43833100, 1.72909…
# ciona_Number_of_peptides.df <- ciona_TMTproC_prenormal.df[,1:3]

vis_dat(ciona_TMTproC_prenormal.df)

vis_miss(ciona_TMTproC_prenormal.df)

miss_case_summary(ciona_TMTproC_prenormal.df)
# A tibble: 7,095 × 3
    case n_miss pct_miss
   <int>  <int>    <dbl>
 1     1      0        0
 2     2      0        0
 3     3      0        0
 4     4      0        0
 5     5      0        0
 6     6      0        0
 7     7      0        0
 8     8      0        0
 9     9      0        0
10    10      0        0
# ℹ 7,085 more rows
# 7095 protein entries

1.0.1 Standard Normalisation

1.0.1.0.1 Normalisation step using rowMean 1st follow by columnMedian
###############################
#### Normalise each row by its average and then by the median columns
###############################
ciona_TMTproC_rowMean.df <-
  ciona_TMTproC_prenormal.df %>%
  rowwise() %>% # row-wise operations
  mutate(row_mean = rowMeans(pick(c(4:11)))) %>% # calculate mean by row
  mutate(across(c(4:11), ~ . / row_mean)) %>% # normalise by dividing each row by its mean
  dplyr::select(-row_mean) %>% # remove the row_mean column
  rename_at(c(4:11), ~ paste0(., "_by_rowMean")) # rename columns with _by_rowMean suffix

ciona_TMTproC_colMedian_normVector <-
  ciona_TMTproC_rowMean.df %>%
  ungroup() %>%
  summarize(across(.cols = 4:11, .fns = median)) # summarize columns median

# # Double checking results
# col_medians <- apply(ciona_TMTproC_rowMean.df[,-c(1:3)], 2, median)
# col_medians_tbl <- as_tibble(t(col_medians))

# normalised by column median and cleaning up the names
ciona_TMTproC_rowMean_colMedian.df <-
  ciona_TMTproC_rowMean.df %>%
  dplyr::select(c(4:11)) %>%
  map2_dfc(ciona_TMTproC_colMedian_normVector, `/`) %>% # Divide each column of the selected df by the corresponding value in the colMedian_norm_.vector_flat vector
  rename_with(
    .fn = ~ str_replace(., "_by_rowMean$", "_byRowCol"),
    .cols = ends_with("_by_rowMean")
  ) %>%
  bind_cols(ciona_TMTproC_rowMean.df[, 1:3], ., .name_repair = "minimal")

# write.table(ciona_TMTproC_rowMean_colMedian.df, "normalized_counts_notGood_TMTproC_ciona.txt", quote=F, col.names=T, row.names=F, sep="\t")

# Plotting raw data
plot_original_ciona <-
  ciona_TMTproC_prenormal.df %>%
  pivot_longer(cols = -c(1:3), names_to = "stage", names_transform = as.factor) %>%
  ggplot(aes(y = factor(stage, levels = rev(ciona_stages)), x = value, fill = stage)) +
  geom_boxplot() +
  scale_color_manual(values = setNames(ciona_stages_palette, ciona_stages), guide = "none") +
  scale_fill_manual(values = setNames(ciona_stages_palette, ciona_stages), guide = "none") +
  labs(title = "Raw data", y = "Developmental stages", x = " Relative abundance") +
  theme_pubr(base_family = "Avenir") +
  theme(legend.position = "none")

# Plotting normalised data
plot_normRowCol_ciona <-
  ciona_TMTproC_rowMean_colMedian.df %>%
  pivot_longer(-c(1:3), names_to = "stage", names_transform = as.factor) %>%
  filter(str_detect(stage, "_byRowCol")) %>%
  mutate(stage = str_remove(stage, "_byRowCol")) %>%
  ggplot(aes(y = factor(stage, levels = rev(ciona_stages)), x = value, fill = stage)) +
  geom_boxplot() +
  scale_color_manual(values = setNames(ciona_stages_palette, ciona_stages), guide = "none") +
  scale_fill_manual(values = setNames(ciona_stages_palette, ciona_stages), guide = "none") +
  labs(title = "Normalised not good", y = "Developmental stages", x = " Relative abundance") +
  theme_pubr(base_family = "Avenir") +
  theme(legend.position = "none")

plot_original_ciona + plot_normRowCol_ciona

1.0.1.0.2 Assessing the normalisation step

CLuster 6 contains protein related to ribosomes, mitochondria, and housekeeping genes. This cluster’s expression decreases over time. Try a different normalization method to emove this technical variation and enhance the biological signals.

###############################
#### Evaluating the normalization process through k-means clustering.
###############################

set.seed(777) # reproduce the clusters

# Fit and predict clusters with k = 8
ciona_kmeans <- kmeans(ciona_TMTproC_rowMean_colMedian.df[, 4:11], centers = 8, nstart = 100, iter.max = 1000)

glance(ciona_kmeans) # extracts a single-row summary: totss, tot.withinss of clustering
# A tibble: 1 × 4
   totss tot.withinss betweenss  iter
   <dbl>        <dbl>     <dbl> <int>
1 13307.        3127.    10179.     6
# clean up the clustering and add cluster prediction to the original data set
ciona_results <- augment(ciona_kmeans, ciona_TMTproC_rowMean_colMedian.df)
ciona_results %>% slice_head(n = 5)
# A tibble: 5 × 12
  Protein_Id_transcript      Protein_Id_Gene Number_of_peptides unfE_byRowCol fertE_byRowCol cell16_byRowCol iniG_byRowCol latN_byRowCol midTII_byRowCol latTII_byRowCol larva_byRowCol .cluster
  <chr>                      <chr>                        <dbl>         <dbl>          <dbl>           <dbl>         <dbl>         <dbl>           <dbl>           <dbl>          <dbl> <fct>   
1 KY21.Chr1.10.v1.nonSL2-1   KY21.Chr1.10                     5          1.21           1.21           1.07          1.02          0.999           0.838           0.942          0.935 6       
2 KY21.Chr1.1000.v1.SL1-1    KY21.Chr1.1000                  13          1.23           1.29           1.14          0.978         0.942           0.921           0.913          0.845 6       
3 KY21.Chr1.1001.v1.SL1-1    KY21.Chr1.1001                  15          1.05           1.03           1.04          1.00          0.980           1.01            1.00           1.03  4       
4 KY21.Chr1.1002.v1.SL1-1    KY21.Chr1.1002                   6          3.95           3.87           0.554         0.133         0.189           0.195           0.216          0.157 2       
5 KY21.Chr1.1004.v1.nonSL4-1 KY21.Chr1.1004                  21          1.06           1.06           1.06          1.04          1.03            1.05            0.937          0.946 4       
# summarizes  per-cluster level info eg how many point each cluster and within cluster sum of squares
ciona_cluster_avgSummary <- tidy(ciona_kmeans)
ciona_cluster_avgSummary
# A tibble: 8 × 11
  unfE_byRowCol fertE_byRowCol cell16_byRowCol iniG_byRowCol latN_byRowCol midTII_byRowCol latTII_byRowCol larva_byRowCol  size withinss cluster
          <dbl>          <dbl>           <dbl>         <dbl>         <dbl>           <dbl>           <dbl>          <dbl> <int>    <dbl> <fct>  
1         0.446         0.462            0.598        0.767         1.06            1.34             1.46           1.60    822    518.  1      
2         2.52          2.19             1.26         0.597         0.520           0.490            0.526          0.589   201    484.  2      
3         0.534         0.612            0.915        1.45          1.47            1.28             0.928          0.861   405    378.  3      
4         0.929         0.939            0.952        1.00          1.02            1.04             1.08           1.11   2429    419.  4      
5         7.30          0.604            0.104        0.0796        0.0713          0.0965           0.156          0.390    37     37.5 5      
6         1.22          1.23             1.17         1.05          0.995           0.928            0.861          0.824  2602    633.  6      
7         0.126         0.0906           0.114        0.0852        0.112           0.455            1.81           4.35    191    218.  7      
8         0.260         0.191            0.203        0.189         0.459           1.21             2.15           2.64    408    439.  8      
# Visualise dynamics x cluster. Eg cluster 1 should be flatter
function_ggparallel(
  data = ciona_cluster_avgSummary %>% rename_with(~ str_remove(., "_byRowCol")),
  col_names = 1:8,
  group_col = "cluster",
  plot_title = "Ciona: cluster center dynamcis"
) +
  geom_line(aes(color = cluster, size = as.factor(size))) +
  scale_color_manual(values = ciona_cluster_palette) +
  theme_pubr(base_family = "Avenir") +
  facet_grid(~cluster) +
  # Visualise each protein behavior x cluster
  function_ggparallel(
    data = ciona_results %>% rename_with(~ str_remove(., "_byRowCol")),
    col_names = 4:11,
    group_col = ".cluster",
    plot_title = "Ciona: proteins dynamics x cluster"
  ) +
  # geom_line(aes(color=cluster, size=as.factor(size))) +
  scale_color_manual(values = ciona_cluster_palette) +
  theme_pubr(base_family = "Avenir") +
  facet_grid(~.cluster) +
  theme(legend.position = "none") +
  plot_layout(ncol = 1) & theme(plot.margin = unit(c(0.5, 0.5, 0, 0.5), "lines"), axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1)) & theme_pubr(base_family = "Avenir")

# ciona_results %>% rename_with(~ str_remove(., "_byRowCol")) %>%
#   pivot_longer(cols = c(4:11), names_to = "stage", values_to = "expression") %>%
#   mutate(stage = fct_relevel(stage, ciona_stages)) %>%
#   ggplot(aes(stage,expression, color = .cluster)) +
#   geom_point(aes(group = Protein_Id_transcript)) +
#   geom_line(aes(group = Protein_Id_transcript)) +
#   scale_color_manual(values = ciona_cluster_palette) +
#   facet_wrap(~.cluster) + theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1))

1.0.2 Improving the normalization process

It seems that dividing the protein values by the median column values is not an effective method for normalizing this data because the “flat” proteins are decreasing over time. As an alternative, we try using ribosomal and/or mitochondrial proteins as internal standards to improve the normalization process and better capture the stable proteins.

1.0.2.1 Mitocarta

The mitochondrial protein annotation provided by the Mitocarta database (https://www.broadinstitute.org/mitocarta/mitocarta30-inventory-mammalian-mitochondrial-proteins-and-pathways) includes only Entrez IDs. However, the Ciona gff3 annotation utilizes HGCN IDs. To map between these two annotations, we first use biomart and then select only the mitochondrial proteins present in our TMTcPro dataframe.

1.0.2.1.1 Genes and protein annotation
# Annotating by gff3
ciona_annotation_gff3 <- as_tibble(read.delim(file = paste0(directory, "a.reference_annotation/annotation_ciona.KY21ensembl.109.MT.tsv"), header = TRUE, sep = "\t", dec = ".", strip.white = T)) %>% dplyr::select(1, 2, 10, 11, 12)
glimpse(ciona_annotation_gff3)
Rows: 55,205
Columns: 5
$ GeneID         <chr> "KY21.Chr13.334", "KY21.Chr11.1168", "KY21.Chr11.1170", "KY21.Chr9.1164", "KY21.Chr7.558", "KY21.Chr11.343", "KY21.Chr12.128", "KY21.Chr2.200", "KY21.Chr3.56", "KY21.Chr1.156", "KY21.Chr1.157", "KY21.Chr1.157", "KY21.Chr1.157", "KY21.Chr1.157", "KY21.Chr1.157", "KY21.Chr1.157", "KY21.Chr1.157", "KY21.Chr1.157", "KY21.Chr9.577", "KY21.Chr9.577", "KY21.Chr12.699", "KY21.Chr12.699", "KY21.Chr9.398", "KY21.UAContig13.6", "KY21.UAContig13.6", "KY21.UAContig13.6", "KY21.UAContig13.6", "KY21.UAContig13.6", "KY21.UAContig32.28", "KY21.UAContig36.22", "KY21.UAContig49.9", "KY21.Chr8.497", "KY21.Chr7.192", "KY21.Chr9.981", "KY21.Chr9.1132", "KY21.Chr13.101", "KY21.Chr7.880", "KY21.Chr10.26", "KY21.Chr1.1883", "KY21.Chr2.1022", "KY21.Chr2.1023", "KY21.Chr2.1023", "KY21.Chr2.1023", "KY21.Chr2.1023", "KY21.Chr2.1023", "KY21.Chr3.114", "KY21.Chr3.115", "KY21.Chr3.115", "KY21.Chr14.562", "KY21.Chr14.562", "KY21.UAContig2.22", "KY21.Chr1.2027", "KY21.Chr5.488", "KY21.Chr7.427"…
$ TranscriptID   <chr> "KY21.Chr13.334.v1.ND1-1", "KY21.Chr11.1168.v2.SL1-1", "KY21.Chr11.1170.v1.SL1-1", "KY21.Chr9.1164.v1.ND1-1", "KY21.Chr7.558.v1.ND1-1", "KY21.Chr11.343.v1.nonSL1-1", "KY21.Chr12.128.v1.ND1-1", "KY21.Chr2.200.v1.SL1-1", "KY21.Chr3.56.v1.ND1-1", "KY21.Chr1.156.v1.SL1-1", "KY21.Chr1.157.v1.SL2-1", "KY21.Chr1.157.v1.SL1-1", "KY21.Chr1.157.v1.nonSL4-1", "KY21.Chr1.157.v1.nonSL6-1", "KY21.Chr1.157.v1.nonSL5-1", "KY21.Chr1.157.v1.nonSL8-1", "KY21.Chr1.157.v1.SL3-1", "KY21.Chr1.157.v1.nonSL7-1", "KY21.Chr9.577.v1.nonSL1-1", "KY21.Chr9.577.v1.nonSL2-1", "KY21.Chr12.699.v1.nonSL1-1", "KY21.Chr12.699.v1.nonSL2-1", "KY21.Chr9.398.v1.SL1-1", "KY21.UAContig13.6.v1.nonSL1-1", "KY21.UAContig13.6.v1.nonSL3-1", "KY21.UAContig13.6.v1.nonSL4-1", "KY21.UAContig13.6.v1.nonSL2-1", "KY21.UAContig13.6.v1.nonSL5-1", "KY21.UAContig32.28.v1.ND1-1", "KY21.UAContig36.22.v1.ND1-1", "KY21.UAContig49.9.v1.ND1-1", "KY21.Chr8.497.v1.SL1-1", "KY21.Chr7.192.v1.ND1-1", "KY21.Chr9.981.v1.ND1-1", "KY…
$ HumanID        <chr> "A2M", "AADAC", "AADAC", "AAMP", "AANAT", "AANAT", "AANAT", "AARS1", "AARS1", "ABAT", "ABAT", "ABAT", "ABAT", "ABAT", "ABAT", "ABAT", "ABAT", "ABAT", "ABCA10", "ABCA10", "ABCA2", "ABCA2", "ABCA3", "ABCA3", "ABCA3", "ABCA3", "ABCA3", "ABCA3", "ABCA3", "ABCA3", "ABCA3", "ABCA4", "ABCA5", "ABCA5", "ABCA5", "ABCA5", "ABCA8", "ABCA8", "ABCA9", "ABCB1", "ABCB1", "ABCB1", "ABCB1", "ABCB1", "ABCB1", "ABCB1", "ABCB1", "ABCB1", "ABCB1", "ABCB1", "ABCB10", "ABCB11", "TAP1", "ABCB5", "ABCB6", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB7", "ABCB8", "ABCB8", "ABCB8", "ABCB9", "ABCB9", "ABCB9", "ABCC1", "ABCC1", "ABCC1", "ABCC1", "ABCC1", "ABCC1", "ABCC1", "ABCC1", "ABCC1", "ABCC1", "ABCC1", "ABCC1", "ABCC1", "ABCC1", "ABCC1", "ABCC1", "AB…
$ HGNC           <chr> "HGNC:7", "HGNC:17", "HGNC:17", "HGNC:18", "HGNC:19", "HGNC:19", "HGNC:19", "HGNC:20", "HGNC:20", "HGNC:23", "HGNC:23", "HGNC:23", "HGNC:23", "HGNC:23", "HGNC:23", "HGNC:23", "HGNC:23", "HGNC:23", "HGNC:30", "HGNC:30", "HGNC:32", "HGNC:32", "HGNC:33", "HGNC:33", "HGNC:33", "HGNC:33", "HGNC:33", "HGNC:33", "HGNC:33", "HGNC:33", "HGNC:33", "HGNC:34", "HGNC:35", "HGNC:35", "HGNC:35", "HGNC:35", "HGNC:38", "HGNC:38", "HGNC:39", "HGNC:40", "HGNC:40", "HGNC:40", "HGNC:40", "HGNC:40", "HGNC:40", "HGNC:40", "HGNC:40", "HGNC:40", "HGNC:40", "HGNC:40", "HGNC:41", "HGNC:42", "HGNC:43", "HGNC:46", "HGNC:47", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:48", "HGNC:49", "HGNC:49…
$ Human_longname <chr> "alpha-2-macroglobulin", "arylacetamide deacetylase", "arylacetamide deacetylase", "angio associated migratory cell protein", "aralkylamine N-acetyltransferase", "aralkylamine N-acetyltransferase", "aralkylamine N-acetyltransferase", "alanyl-tRNA synthetase 1", "alanyl-tRNA synthetase 1", "4-aminobutyrate aminotransferase", "4-aminobutyrate aminotransferase", "4-aminobutyrate aminotransferase", "4-aminobutyrate aminotransferase", "4-aminobutyrate aminotransferase", "4-aminobutyrate aminotransferase", "4-aminobutyrate aminotransferase", "4-aminobutyrate aminotransferase", "4-aminobutyrate aminotransferase", "ATP binding cassette subfamily A member 10", "ATP binding cassette subfamily A member 10", "ATP binding cassette subfamily A member 2", "ATP binding cassette subfamily A member 2", "ATP binding cassette subfamily A member 3", "ATP binding cassette subfamily A member 3", "ATP binding cassette subfamily A member 3", "ATP binding cassette subfamily A member 3",…
# Annotating by HGNC
human_mitocarta <- read_excel(paste0(directory, "a.reference_annotation/Human.MitoCarta3.0.xls"), sheet = "A Human MitoCarta3.0", trim_ws = T, col_names = T) %>%
  dplyr::select(1, 3, 5) %>%
  mutate(human_mitocarta = "human_mitocarta")
# dplyr::select(1,3,5,15,16) $uniprot lenght and geneID
glimpse(human_mitocarta)
Rows: 1,136
Columns: 4
$ HumanGeneID     <dbl> 1537, 6390, 10229, 6389, 7384, 84274, 5160, 57017, 6182, 513, 9377, 122961, 9512, 7386, 498, 4967, 5162, 7385, 6392, 60488, 27089, 116540, 1629, 5166, 4191, 23107, 1431, 80273, 1737, 10128, 1743, 8050, 85476, 25874, 4719, 26589, 514, 5250, 51649, 2271, 23203, 506, 135154, 29796, 81689, 8803, 51805, 55699, 64960, 3419, 1353, 2110, 26519, 64981, 51069, 593, 7388, 192286, 539, 1892, 25875, 1337, 1355, 10939, 3030, 2108, 374291, 1376, 594, 3420, 23395, 35, 3954, 521, 4976, 549, 8802, 4729, 51004, 84545, 83451, 522, 4714, 9361, 1738, 9131, 79746, 139322, 124995, 92399, 4728, 34, 10989, 26520, 291, 25813, 4720, 4723, 50, 6832, 2235, 4528, 5245, 26275, 51116, 3313, 6834, 10935, 27069, 60558, 26517, 57128, 54948, 64976, 3421, 6391, 4711, 54949, 93058, 8209, 25828, 55168, 1345, 4715, 23788, 4700, 788, 65008, 10469, 10063, 440574, 6648, 1340, 7416, 8192, 3033, 33, 38, 115416, 10845, 4724, 51073, 708, 10531, 55245, 128308, 79922, 51102, 65080, 84263, 63931, 11194, 91647,…
$ Symbol          <chr> "CYC1", "SDHB", "COQ7", "SDHA", "UQCRC1", "COQ5", "PDHA1", "COQ9", "MRPL12", "ATP5F1D", "COX5A", "ISCA2", "PMPCB", "UQCRFS1", "ATP5F1A", "OGDH", "PDHB", "UQCRC2", "SDHD", "MRPS35", "UQCRQ", "MRPL53", "DBT", "PDK4", "MDH2", "MRPS27", "CS", "GRPEL1", "DLAT", "LRPPRC", "DLST", "PDHX", "GFM1", "MPC2", "NDUFS1", "MRPL46", "ATP5F1E", "SLC25A3", "MRPS23", "FH", "PMPCA", "ATP5F1B", "SDHAF4", "UQCR10", "ISCA1", "SUCLA2", "COQ3", "IARS2", "MRPS15", "IDH3A", "COX11", "ETFDH", "TIMM10", "MRPL34", "MRPL2", "BCKDHA", "UQCRH", "HIGD2A", "ATP5PO", "ECHS1", "LETMD1", "COX6A1", "COX15", "AFG3L2", "HADHA", "ETFA", "NDUFS7", "CPT2", "BCKDHB", "IDH3B", "LARS2", "ACADS", "LETM1", "ATP5ME", "OPA1", "AUH", "SUCLG1", "NDUFV2", "COQ6", "MRPL43", "ABHD11", "ATP5PF", "NDUFB8", "LONP1", "DLD", "AIFM1", "ECHDC3", "APOOL", "MRPL10", "MRRF", "NDUFS8", "ACADM", "IMMT", "TIMM9", "SLC25A4", "SAMM50", "NDUFS2", "NDUFV1", "ACO2", "SUPV3L1", "FECH", "MTIF2", "PHB", "HIBCH", "MRPS2", "HSPA9", "SURF…
$ Description     <chr> "cytochrome c1", "succinate dehydrogenase complex iron sulfur subunit B", "coenzyme Q7, hydroxylase", "succinate dehydrogenase complex flavoprotein subunit A", "ubiquinol-cytochrome c reductase core protein 1", "coenzyme Q5, methyltransferase", "pyruvate dehydrogenase E1 subunit alpha 1", "coenzyme Q9", "mitochondrial ribosomal protein L12", "ATP synthase F1 subunit delta", "cytochrome c oxidase subunit 5A", "iron-sulfur cluster assembly 2", "peptidase, mitochondrial processing subunit beta", "ubiquinol-cytochrome c reductase, Rieske iron-sulfur polypeptide 1", "ATP synthase F1 subunit alpha", "oxoglutarate dehydrogenase", "pyruvate dehydrogenase E1 subunit beta", "ubiquinol-cytochrome c reductase core protein 2", "succinate dehydrogenase complex subunit D", "mitochondrial ribosomal protein S35", "ubiquinol-cytochrome c reductase complex III subunit VII", "mitochondrial ribosomal protein L53", "dihydrolipoamide branched chain transacylase E2", "pyruvate dehydr…
$ human_mitocarta <chr> "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "human_mitocarta", "huma…
1.0.2.1.2 Mapping NCBI Entrez Gene ID of Mitocarta to HGNC symbols
###############################
#### Convert the NCBI Entrez Gene IDs from Mitocarta to their corresponding HGNC symbols.
###############################

ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl") # use human genes dataset

# head(listAttributes(ensembl),20) # define the attributes to retrieve (HGNC symbol and Entrez Gene ID)

# split in two to handle the number of attributes
attributes <- c("entrezgene_id", "hgnc_symbol", "hgnc_id", "description")
attributes2 <- c("entrezgene_id", "entrezgene_accession") # "entrezgene_description"

# convert Entrez Gene ID of mitocarta to HGNC symbols
entrez_ids <- human_mitocarta$HumanGeneID # 1136

hgnc_symbols <- getBM(
  attributes = attributes,
  filters = "entrezgene_id",
  values = entrez_ids,
  mart = ensembl
)
hgnc_symbols2 <- getBM(
  attributes = attributes2,
  filters = "entrezgene_id",
  values = entrez_ids,
  mart = ensembl
)

hgnc_symbols <-
  hgnc_symbols %>%
  left_join(hgnc_symbols2, by = "entrezgene_id") %>% # Merge the two dfs
  relocate(hgnc_id, entrezgene_accession, hgnc_symbol, entrezgene_id, description)
glimpse(hgnc_symbols)
Rows: 1,137
Columns: 5
$ hgnc_id              <chr> "HGNC:40910", "HGNC:35410", "HGNC:40038", "HGNC:993", "HGNC:994", "HGNC:34528", "HGNC:35428", "HGNC:17312", "HGNC:40045", "HGNC:38844", "HGNC:47", "HGNC:2973", "HGNC:2264", "HGNC:44317", "HGNC:12367", "HGNC:9259", "HGNC:15714", "HGNC:16264", "HGNC:17366", "HGNC:10983", "HGNC:10985", "HGNC:50696", "HGNC:20567", "HGNC:18349", "HGNC:2244", "HGNC:16632", "HGNC:17310", "HGNC:16897", "HGNC:13734", "HGNC:16902", "HGNC:39", "HGNC:12730", "HGNC:1530", "HGNC:6737", "HGNC:17315", "HGNC:83", "HGNC:18001", "HGNC:14601", "HGNC:1329", "HGNC:17316", "HGNC:845", "HGNC:17663", "HGNC:17169", "HGNC:16637", "HGNC:14484", "HGNC:18155", "HGNC:7437", "HGNC:8587", "HGNC:14247", "HGNC:24639", "HGNC:7506", "HGNC:21062", "HGNC:9186", "HGNC:12843", "HGNC:344", "HGNC:7434", "HGNC:3978", "HGNC:2088", "HGNC:6985", "HGNC:8769", "HGNC:16985", "HGNC:24676", "HGNC:9354", "HGNC:315", "HGNC:15746", "HGNC:18431", "HGNC:53111", "HGNC:30862", "HGNC:6047", "HGNC:16429", "HGNC:11713", "HGNC:4907",…
$ entrezgene_accession <chr> "TSTD3", "TSTD1", "PET100", "BCL2L10", "BCL2L11", "TOMM6", "CMC4", "TIMM23", "PET117", "ATP5MF-PTCD1", "ABCB6", "DNM1L", "COX17", "PYURF", "TSFM", "PPIF", "LRPPRC", "TRAP1", "AASS", "SLC25A13", "SLC25A15", "PIGBOS1", "NME6", "DHRS2", "COQ7", "MRPS31", "TIMM17B", "RIDA", "GLYAT", "BCKDK", "ABCA9", "WARS2", "MICU1", "LYPLA1", "TIMM17A", "ACAA2", "TOMM40", "ECI2", "SLC30A9", "TIMM44", "ATP5PD", "PITRM1", "PRDX4", "HTATIP2", "MRPL28", "TXNRD2", "MTHFS", "PAICS", "ATP5MG", "PRELID3A", "MTX2", "FARS2", "POLQ", "YME1L1", "AHCYL1", "MTHFD2", "ALDH1L1", "CLPX", "ME3", "MRPS30", "DHRS4", "FASTK", "PRDX3", "AFG3L2", "TOMM34", "ACOT2", "HTD2", "UQCR11", "IMMT", "LIAS", "TDRKH", "HIBADH", "NUDT6", "NUDT5", "ABCB8", "PLPBP", "AKAP10", "MRPL3", "POLG2", "CA5B", "PXMP4", "RDH13", "FDX2", "HOGA1", "MTFR2", "PARK7", "PHB2", "ACOT7", "SDSL", "LACTB", "SLC25A25", "OSBPL1A", "PTPMT1", "OMA1", "SLC25A26", "MALSU1", "DHRS1", "CKMT1B", "CKMT2", "FAM210B", "COX20", "ACSM1", "TOP1…
$ hgnc_symbol          <chr> "TSTD3", "TSTD1", "PET100", "BCL2L10", "BCL2L11", "TOMM6", "CMC4", "TIMM23", "PET117", "ATP5MF-PTCD1", "ABCB6", "DNM1L", "COX17", "PYURF", "TSFM", "PPIF", "LRPPRC", "TRAP1", "AASS", "SLC25A13", "SLC25A15", "PIGBOS1", "NME6", "DHRS2", "COQ7", "MRPS31", "TIMM17B", "RIDA", "GLYAT", "BCKDK", "ABCA9", "WARS2", "MICU1", "LYPLA1", "TIMM17A", "ACAA2", "TOMM40", "ECI2", "SLC30A9", "TIMM44", "ATP5PD", "PITRM1", "PRDX4", "HTATIP2", "MRPL28", "TXNRD2", "MTHFS", "PAICS", "ATP5MG", "PRELID3A", "MTX2", "FARS2", "POLQ", "YME1L1", "AHCYL1", "MTHFD2", "ALDH1L1", "CLPX", "ME3", "MRPS30", "DHRS4", "FASTK", "PRDX3", "AFG3L2", "TOMM34", "ACOT2", "HTD2", "UQCR11", "IMMT", "LIAS", "TDRKH", "HIBADH", "NUDT6", "NUDT5", "ABCB8", "PLPBP", "AKAP10", "MRPL3", "POLG2", "CA5B", "PXMP4", "RDH13", "FDX2", "HOGA1", "MTFR2", "PARK7", "PHB2", "ACOT7", "SDSL", "LACTB", "SLC25A25", "OSBPL1A", "PTPMT1", "OMA1", "SLC25A26", "MALSU1", "DHRS1", "CKMT1B", "CKMT2", "FAM210B", "COX20", "ACSM1", "TOP1…
$ entrezgene_id        <int> 100130890, 100131187, 100131801, 10017, 10018, 100188893, 100272147, 100287932, 100303755, 100526740, 10058, 10059, 10063, 100996939, 10102, 10105, 10128, 10131, 10157, 10165, 10166, 101928527, 10201, 10202, 10229, 10240, 10245, 10247, 10249, 10295, 10350, 10352, 10367, 10434, 10440, 10449, 10452, 10455, 10463, 10469, 10476, 10531, 10549, 10553, 10573, 10587, 10588, 10606, 10632, 10650, 10651, 10667, 10721, 10730, 10768, 10797, 10840, 10845, 10873, 10884, 10901, 10922, 10935, 10939, 10953, 10965, 109703458, 10975, 10989, 11019, 11022, 11112, 11162, 11164, 11194, 11212, 11216, 11222, 11232, 11238, 11264, 112724, 112812, 112817, 113115, 11315, 11331, 11332, 113675, 114294, 114789, 114876, 114971, 115209, 115286, 115416, 115817, 1159, 1160, 116151, 116228, 116285, 116447, 116540, 116541, 117145, 118487, 118881, 118980, 119559, 122704, 122961, 123096, 123263, 123876, 124454, 124995, 125170, 125228, 125965, 125988, 126129, 126328, 126789, 127018, 128240, 12830…
$ description          <chr> "thiosulfate sulfurtransferase like domain containing 3 [Source:HGNC Symbol;Acc:HGNC:40910]", "thiosulfate sulfurtransferase like domain containing 1 [Source:HGNC Symbol;Acc:HGNC:35410]", "PET100 cytochrome c oxidase chaperone [Source:HGNC Symbol;Acc:HGNC:40038]", "BCL2 like 10 [Source:HGNC Symbol;Acc:HGNC:993]", "BCL2 like 11 [Source:HGNC Symbol;Acc:HGNC:994]", "translocase of outer mitochondrial membrane 6 [Source:HGNC Symbol;Acc:HGNC:34528]", "C-X9-C motif containing 4 [Source:HGNC Symbol;Acc:HGNC:35428]", "translocase of inner mitochondrial membrane 23 [Source:HGNC Symbol;Acc:HGNC:17312]", "PET117 cytochrome c oxidase chaperone [Source:HGNC Symbol;Acc:HGNC:40045]", "ATP5MF-PTCD1 readthrough [Source:HGNC Symbol;Acc:HGNC:38844]", "ATP binding cassette subfamily B member 6 (Langereis blood group) [Source:HGNC Symbol;Acc:HGNC:47]", "dynamin 1 like [Source:HGNC Symbol;Acc:HGNC:2973]", "cytochrome c oxidase copper chaperone COX17 [Source:HGNC Symbol;Acc:HGN…
# ENSG00000230623 not in HGNC
# write.table(hgnc_symbols, "hgnc_symbols_to_match.txt", quote=F, col.names=T, row.names=F, sep="\t")

# human_match_table <- as_tibble(read.delim(file = paste0(directory,"a_reference_annotation/ciona_KY21/hgnc_symbols_to_match.txt"), header = TRUE, sep = "\t", dec = ".",strip.white = T))
# glimpse(human_match_table)

human_mitocarta_HGNC.df <-
  human_mitocarta %>%
  left_join(hgnc_symbols, multiple = "all", by = c("HumanGeneID" = "entrezgene_id")) %>%
  dplyr::select(5, 1, 2, 6, 7) %>%
  filter(!(hgnc_id == "HGNC:1995" & HumanGeneID == 1159)) %>%
  arrange(hgnc_id) %>%
  slice(-(1:3)) # 3 rows are empyty for hgnc_id and HumanGeneID

vis_dat(human_mitocarta_HGNC.df)

vis_miss(human_mitocarta_HGNC.df)

miss_case_summary(human_mitocarta_HGNC.df)
# A tibble: 1,133 × 3
    case n_miss pct_miss
   <int>  <int>    <dbl>
 1     1      0        0
 2     2      0        0
 3     3      0        0
 4     4      0        0
 5     5      0        0
 6     6      0        0
 7     7      0        0
 8     8      0        0
 9     9      0        0
10    10      0        0
# ℹ 1,123 more rows
# human_mitocarta_HGNC <- human_mitocarta_HGNC[!apply(human_mitocarta_HGNC == " ", 1, all),]

# diff <- human_mitocarta_HGNC$Symbol == human_mitocarta_HGNC$hgnc_symbol
# diff2 <- human_mitocarta_HGNC$entrezgene_accession == human_mitocarta_HGNC$hgnc_symbol
# human_mitocarta_HGNC[!diff, "entrezgene_accession"] # CKMT1A and CKMT1B are duplicates
# it look good you can drop the common columns
# PRORP HGNC:19958
# protein only RNase P catalytic subunit [Source:HGNC Symbol;Acc:HGNC:19958]
# protein only RNase P catalytic subunit
1.0.2.1.3 Visualise mitocarta genes
###############################
#### Visualise and select stable mitochondria proteins
###############################

# Select the mitochondrial proteins present in TMTcPro with less than 2 fold change in relative abundance over the time course.
ciona_TMTproC_rowMean_mitocartaHGNC.df <-
  ciona_TMTproC_rowMean.df %>%
  left_join(ciona_annotation_gff3 %>% dplyr::select(2:4), by = c("Protein_Id_transcript" = "TranscriptID")) %>%
  right_join(human_mitocarta_HGNC.df %>% dplyr::select(1, 5), by = c("HGNC" = "hgnc_id")) %>%
  filter_at(vars(4:11), all_vars(. <= 2)) # fold change less than 2 over development

# 2nd filtering step
ciona_rowMean_mitocartaHGNC_flatDynamics.vector <-
  ciona_TMTproC_rowMean_mitocartaHGNC.df %>%
  rename_with(.fn = ~ str_replace(., "_by_rowMean$", ""), .cols = ends_with("_by_rowMean")) %>%
  rowwise() %>%
  transmute(
    Protein_Id_transcript = Protein_Id_transcript,
    diff_1cell_unfert = abs(`fertE` - unfE),
    diff_16cell_1cell = abs(`cell16` - `fertE`),
    diff_110cell_16cell = abs(`iniG` - `cell16`),
    diff_lateneurula_110cell = abs(`latN` - `iniG`),
    diff_midtailbud_lateneurula = abs(`midTII` - `latN`),
    diff_latetailbud_midtailbud = abs(`latTII` - `midTII`),
    diff_larva_latetailbud = abs(larva - `latTII`)
  ) %>%
  mutate(diff_sum = sum(c_across(starts_with("diff")))) %>% # calculate sum difference
  filter(diff_sum < 1) # keep only proteinz with sum less than 2 sum difference

# Filter step 1
toselect_flat <-
  ciona_rowMean_mitocartaHGNC_flatDynamics.vector$Protein_Id_transcript
# Filter step 2
ciona_rowMean_mitocartaHGNC_flatDynamics.df <- ciona_TMTproC_rowMean_mitocartaHGNC.df[ciona_TMTproC_rowMean_mitocartaHGNC.df$Protein_Id_transcript %in% toselect_flat, ]

# #  lag betwenn stages give the same resutls as above
# df %>%
#   rowwise() %>%
#   mutate(diff_all = sum(abs(diff(c_across(unfertilised:larva), lag = 1)))) %>%
#   filter(diff_all < 1)

HGNCmitocarta.dynamics.1 <- plot_HGNCmitocarta_dynamics(0, 140)
HGNCmitocarta.dynamics.2 <- plot_HGNCmitocarta_dynamics(141, 270)

HGNCmitocarta.dynamics.1 # they look good

HGNCmitocarta.dynamics.2 # they look good

# check result with plot: all proteins
ciona_TMTproC_rowMean_mitocartaHGNC.df %>%
  ungroup() %>%
  rename_with(.fn = ~ str_replace(., "_by_rowMean$", ""), .cols = ends_with("_by_rowMean")) %>%
  pivot_longer(cols = c(4:11), names_to = "stage", values_to = "expression") %>%
  mutate(stage = fct_relevel(stage, ciona_stages)) %>%
  ggplot(aes(stage, expression, color = Protein_Id_transcript)) +
  geom_point(aes(group = Protein_Id_transcript)) +
  labs(title = "Some are still too dynamics and need to be exclude for normalization") +
  geom_line(aes(group = Protein_Id_transcript)) +
  ylim(0, 2) +
  theme_pubr(base_family = "Avenir") +
  theme(legend.position = "none") +
  ciona_rowMean_mitocartaHGNC_flatDynamics.df %>%
  ungroup() %>%
  rename_with(.fn = ~ str_replace(., "_by_rowMean$", ""), .cols = ends_with("_by_rowMean")) %>%
  pivot_longer(cols = c(4:11), names_to = "stage", values_to = "expression") %>%
  mutate(stage = fct_relevel(stage, ciona_stages)) %>%
  ggplot(aes(stage, expression, color = Protein_Id_transcript)) +
  geom_point(aes(group = Protein_Id_transcript)) +
  labs(title = "Good for normalization") +
  geom_line(aes(group = Protein_Id_transcript)) +
  ylim(0, 2) & theme_pubr(base_family = "Avenir",legend = "none")

1.0.2.1.4 Normalisation step using rowMean follow by flat df columnMedian
###############################
#### Normalising using the median column of stable mitochondria proteins
###############################

# Median by column of mito protein
ciona_colMedian_normVector_mitocartaHGNC <-
  ciona_rowMean_mitocartaHGNC_flatDynamics.df %>%
  dplyr::select(4:11) %>%
  ungroup() %>%
  summarize_all(median)

# Normalise by median column
ciona_TMTproC_rowMeancolMedian_flat_goodNorm.df <-
  ciona_TMTproC_rowMean.df %>%
  dplyr::select(4:11) %>%
  map2_dfc(ciona_colMedian_normVector_mitocartaHGNC, `/`) %>% # Divide each column of the selected df by the corresponding value in the colMedian_norm_.vector_flat vector
  bind_cols(ciona_TMTproC_rowMean.df[, 1:3], ., .name_repair = "minimal")

ciona_TMTproC_rowMeancolMedian_flat_goodNorm.df <-
  ciona_TMTproC_rowMean.df %>%
  dplyr::select(4:11) %>%
  map2_dfc(ciona_colMedian_normVector_mitocartaHGNC, `/`) %>% # Divide each column of the selected df by the corresponding value in the colMedian_norm_.vector_flat vector
  rename_with(
    .fn = ~ str_replace(., "_by_rowMean$", "_byRowCol"),
    .cols = ends_with("_by_rowMean")
  ) %>%
  bind_cols(ciona_TMTproC_rowMean.df[, 1:3], ., .name_repair = "minimal") %>%
  left_join(ciona_annotation_gff3 %>% dplyr::select(2:4), by = c("Protein_Id_transcript" = "TranscriptID")) %>%
  relocate(1, 2, 12, 13, 3, everything())
1.0.2.1.5 Assessing the mitocarta normalisation step

Normalization look good, with a cluster of protein ( ribosome and mitocondria) that are flat during embryogeneis . eg housekeeping

###############################
#### Plot before after mitocarta only proteins dynamics 
###############################

# Before
ciona_avgNorm_std <-
  ciona_TMTproC_rowMean_colMedian.df %>%
  filter(Protein_Id_transcript %in% toselect_flat) %>%
  # ungroup() %>%
  summarize(across(c(4:11), median)) %>%
  pivot_longer(everything()) %>%
  mutate(
    name = str_remove(name, "_byRowCol"),
    name = case_when(
      name == "unfE" ~ 1,
      name == "fertE" ~ 2,
      name == "cell16" ~ 3,
      name == "iniG" ~ 4,
      name == "latN" ~ 5,
      name == "midTII" ~ 6,
      name == "latTII" ~ 7,
      name == "larva" ~ 8
    )
  )

# After
ciona_avgNorm_mitocarta <-
  ciona_TMTproC_rowMeancolMedian_flat_goodNorm.df %>%
  filter(Protein_Id_transcript %in% toselect_flat) %>%
  summarize(across(c(6:13), median)) %>%
  pivot_longer(everything()) %>%
  mutate(
    name = str_remove(name, "_byRowCol"),
    name = case_when(
      name == "unfE" ~ 1,
      name == "fertE" ~ 2,
      name == "cell16" ~ 3,
      name == "iniG" ~ 4,
      name == "latN" ~ 5,
      name == "midTII" ~ 6,
      name == "latTII" ~ 7,
      name == "larva" ~ 8
    )
  )

function_ggparallel(
  data = ciona_TMTproC_rowMean_colMedian.df %>%
    filter(Protein_Id_transcript %in% toselect_flat) %>% rename_with(~ str_remove(., "_byRowCol")),
  col_names = 4:11,
  group_col = "Protein_Id_transcript",
  plot_title = "Ciona: std norm prot dynamcis"
) +
  theme(legend.position = "none") +
  ggplot(ciona_avgNorm_std, aes(x = name, y = value)) +
  geom_point() +
  theme_pubr(base_family = "Avenir") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  ggtitle("Linear Regression of Stage vs. Value") +
  xlab("Stage") +
  ylab("Value") +

  function_ggparallel(
    data = ciona_TMTproC_rowMeancolMedian_flat_goodNorm.df %>%
      filter(Protein_Id_transcript %in% toselect_flat) %>% rename_with(~ str_remove(., "_byRowCol")),
    col_names = 6:13,
    group_col = "Protein_Id_transcript",
    plot_title = "Ciona: std mito prot dynamcis"
  ) +
  theme(legend.position = "none") +
  ggplot(ciona_avgNorm_mitocarta, aes(x = name, y = value)) +
  # ggplot(ciona_avgNorm_mitocarta, aes(x = as.numeric(sub("Stage ", "", name)), y = value)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  ggtitle("Linear Regression of Stage vs. Value") +
  theme_pubr(base_family = "Avenir") +
  xlab("Stage") +
  ylab("Value") & theme_pubr(legend = "none",base_family = "Avenir")

# Save the normalised TMTproC data for Ciona
ciona_TMTproC_rowMeancolMedian_flat_goodNorm.df %>%
  rename_with(~ str_remove(., "_byRowCol")) %>%
  rename("protein_id_transcript" = "Protein_Id_transcript", "protein_id_gene" = "Protein_Id_Gene", "human_id" = "HumanID", "hgnc" = "HGNC", "number_of_peptides" = "Number_of_peptides") %>%
  write.table(., paste0(directory_save, "/ciona_proteome_norm.tsv"), quote = F, col.names = T, row.names = F, sep = "\t")
1.0.2.1.6 Same results as above
# # divide each column of df by the corresponding column of vector median 
# norm_ciona_mitocarta <- ciona_TMTproC_rowMean.df
# norm_ciona_mitocarta[, 4:11] <- mapply('/', norm_ciona_mitocarta[, 4:11], ciona_colMedian_normVector_mitocartaHGNC) 
# 
# norm_ciona_mitocarta
# ciona_TMTproC_rowMeancolMedian_flat_goodNorm.df
1.0.2.1.7 Plotting mitocarta normalisation
###############################
#### Kmeans dynamics of Ciona protein dataset with mitocarta normalisation 
###############################

set.seed(777) # reproduce the clusters

# Fit and predict clusters with k = 8
ciona_kmeans_mito <- kmeans(ciona_TMTproC_rowMeancolMedian_flat_goodNorm.df[,6:13], centers = 8, nstart = 100, iter.max = 1000)

# clean up the clustering and add cluster prediction to the original data set
ciona_results_mito <- augment(ciona_kmeans_mito, ciona_TMTproC_rowMeancolMedian_flat_goodNorm.df)
ciona_results_mito %>% slice_head(n = 5)
# A tibble: 5 × 14
  Protein_Id_transcript      Protein_Id_Gene HumanID HGNC       Number_of_peptides unfE_byRowCol fertE_byRowCol cell16_byRowCol iniG_byRowCol latN_byRowCol midTII_byRowCol latTII_byRowCol larva_byRowCol .cluster
  <chr>                      <chr>           <chr>   <chr>                   <dbl>         <dbl>          <dbl>           <dbl>         <dbl>         <dbl>           <dbl>           <dbl>          <dbl> <fct>   
1 KY21.Chr1.10.v1.nonSL2-1   KY21.Chr1.10    TAF9    HGNC:11542                  5         1.04           1.01            1.00          0.961         1.00            0.863           1.06           1.07  1       
2 KY21.Chr1.1000.v1.SL1-1    KY21.Chr1.1000  PSMB7   HGNC:9544                  13         1.06           1.08            1.06          0.918         0.943           0.948           1.03           0.970 1       
3 KY21.Chr1.1001.v1.SL1-1    KY21.Chr1.1001  LRRC47  HGNC:29207                 15         0.902          0.856           0.974         0.942         0.981           1.04            1.13           1.18  1       
4 KY21.Chr1.1002.v1.SL1-1    KY21.Chr1.1002  TDRD1   HGNC:11712                  6         3.40           3.23            0.519         0.125         0.189           0.201           0.243          0.180 3       
5 KY21.Chr1.1004.v1.nonSL4-1 KY21.Chr1.1004  DNAJC2  HGNC:13192                 21         0.908          0.881           0.988         0.976         1.03            1.09            1.05           1.08  1       
# summarizes  per-cluster level info eg how many point each cluster and within cluster sum of squares
ciona_cluster_avg_summary_mito <- tidy(ciona_kmeans_mito)
ciona_cluster_avg_summary_mito
# A tibble: 8 × 11
  unfE_byRowCol fertE_byRowCol cell16_byRowCol iniG_byRowCol latN_byRowCol midTII_byRowCol latTII_byRowCol larva_byRowCol  size withinss cluster
          <dbl>          <dbl>           <dbl>         <dbl>         <dbl>           <dbl>           <dbl>          <dbl> <int>    <dbl> <fct>  
1         1.01          0.983            1.06         0.993         1.01             0.979           0.999          0.981  3193     572. 1      
2         0.748         0.730            0.847        0.910         1.01             1.10            1.28           1.37   1998     430. 2      
3         1.90          1.86             1.33         0.684         0.611            0.552           0.563          0.552   237     403. 3      
4         0.111         0.0689           0.100        0.0740        0.0956           0.363           1.86           5.32    136     173. 4      
5         5.73          0.492            0.142        0.105         0.127            0.170           0.299          0.710    47     112. 5      
6         0.332         0.326            0.466        0.604         0.999            1.42            1.78           2.02    664     514. 6      
7         0.449         0.497            0.829        1.32          1.47             1.37            1.07           1.01    444     424. 7      
8         0.215         0.143            0.168        0.140         0.342            1.09            2.49           3.32    376     417. 8      
# Visualise dynamics x cluster
function_ggparallel(data = ciona_cluster_avg_summary_mito %>% rename_with(.fn = ~str_replace(., "_byRowCol$", ""),.cols = ends_with("_byRowCol")),
                    col_names = 1:8,
                    group_col = "cluster",
                    plot_title = "Ciona mito: cluster dynamcis") + 
  geom_line(aes(color=cluster, size=as.factor(size))) +
  scale_color_manual(values = ciona_cluster_palette) +
  theme_pubr(base_family = "Avenir") +
  facet_grid(~ cluster) +
  theme(legend.position = "bottom") +
# Visualise each protein behavior x cluster
function_ggparallel(data = ciona_results_mito %>% rename_with(.fn = ~str_replace(., "_byRowCol$", ""),.cols = ends_with("_byRowCol")),
                    col_names = 6:13,
                    group_col = ".cluster",
                    plot_title = "Ciona mito: proteins dynamics x cluster") + 
  geom_line(aes(color=.cluster)) +
  scale_color_manual(values = ciona_cluster_palette) +
  theme_pubr(base_family = "Avenir") +
  facet_grid(~ .cluster) +
  theme(legend.position = "none") &
plot_layout(ncol = 1) & theme(plot.margin = unit(c(0.5, 0.5, 0, 0.5), "lines"), axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1)) & theme_pubr(legend = "none",base_family = "Avenir")

# results_mito %>%  pivot_longer(cols = c(6:13),
#                         names_to = "stage", 
#                         values_to = "expression") %>% 
#   mutate(stage = str_remove(stage, "_byRowCol"),
#          stage = fct_relevel(stage, ciona_stages)) %>% 
#   ggplot(aes(stage,expression, color = .cluster)) +
#   geom_point(aes(group = Protein_Id_transcript)) +
#   geom_line(aes(group = Protein_Id_transcript)) +
#   facet_wrap(~.cluster)

# Boxplot all entries
plot_normGood_ciona <-
  ciona_TMTproC_rowMeancolMedian_flat_goodNorm.df %>% 
    pivot_longer(-c(1:5), names_to = "stage", names_transform = as.factor) %>%  
    filter(str_detect(stage, "_byRowCol")) %>% 
    mutate(stage = str_remove(stage, "_byRowCol")) %>% 
  ggplot(aes(y = factor(stage, levels = rev(ciona_stages)), x = value, fill = stage)) +    
  # geom_line(aes(group = Protein_Id_transcript)) +
  # geom_point(aes(color=stage)) +
  geom_boxplot() +
  scale_color_manual(values = setNames(ciona_stages_palette, ciona_stages), guide = "none") +
  scale_fill_manual(values = setNames(ciona_stages_palette, ciona_stages), guide = "none") +
  labs(title = "Normalised mitocarta good", y = "Developmental stages", x ="Relative abundance") +
  theme_pubr(base_family = "Avenir") + theme(legend.position = "none") + theme_pubr(legend = "none",base_family = "Avenir")
    
plot_original_ciona + plot_normRowCol_ciona + plot_normGood_ciona & theme_pubr(legend = "none",base_family = "Avenir")

1.0.2.2 Ribosomes

Normalising with ribosomes yield simialr results

ciona_ribosome <-  as_tibble(read_excel(path = paste0(directory,"a.reference_annotation/Supp Table - KY21_info.xlsx"),col_names = T,trim_ws = T,sheet = "ribosome_prot"))
glimpse(ciona_ribosome)
Rows: 3,307
Columns: 3
$ GeneID       <chr> "KY21.Chr1.702", "KY21.Chr1.702", "KY21.Chr1.702", "KY21.Chr1.702", "KY21.Chr1.702", "KY21.Chr1.705", "KY21.Chr1.705", "KY21.Chr1.705", "KY21.Chr1.705", "KY21.Chr1.705", "KY21.Chr1.705", "KY21.Chr1.705", "KY21.Chr1.705", "KY21.Chr1.720", "KY21.Chr1.720", "KY21.Chr1.720", "KY21.Chr1.720", "KY21.Chr1.720", "KY21.Chr1.754", "KY21.Chr1.754", "KY21.Chr1.754", "KY21.Chr1.770", "KY21.Chr1.791", "KY21.Chr1.791", "KY21.Chr1.791", "KY21.Chr1.791", "KY21.Chr1.791", "KY21.Chr1.791", "KY21.Chr1.791", "KY21.Chr1.791", "KY21.Chr1.791", "KY21.Chr1.791", "KY21.Chr1.791", "KY21.Chr1.791", "KY21.Chr1.791", "KY21.Chr1.791", "KY21.Chr1.791", "KY21.Chr1.795", "KY21.Chr1.795", "KY21.Chr1.797", "KY21.Chr1.824", "KY21.Chr1.884", "KY21.Chr1.904", "KY21.Chr1.909", "KY21.Chr1.909", "KY21.Chr1.909", "KY21.Chr1.909", "KY21.Chr1.909", "KY21.Chr1.979", "KY21.Chr1.979", "KY21.Chr1.979", "KY21.Chr1.979", "KY21.Chr1.1015", "KY21.Chr1.1071", "KY21.Chr1.1071", "KY21.Chr1.1072", "KY21.Chr1.1072", "KY…
$ TranscriptID <chr> "KY21.Chr1.702.v1.SL1-1", "KY21.Chr1.702.v1.SL2-1", "KY21.Chr1.702.v1.SL3-1", "KY21.Chr1.702.v1.nonSL4-1", "KY21.Chr1.702.v1.nonSL5-1", "KY21.Chr1.705.v1.ND1-1", "KY21.Chr1.705.v1.SL2-1", "KY21.Chr1.705.v1.SL3-1", "KY21.Chr1.705.v1.SL4-1", "KY21.Chr1.705.v1.SL5-1", "KY21.Chr1.705.v1.SL6-1", "KY21.Chr1.705.v1.SL7-1", "KY21.Chr1.705.v1.SL8-1", "KY21.Chr1.720.v1.nonSL1-1", "KY21.Chr1.720.v1.nonSL2-1", "KY21.Chr1.720.v1.nonSL3-1", "KY21.Chr1.720.v1.nonSL4-1", "KY21.Chr1.720.v1.nonSL5-1", "KY21.Chr1.754.v1.SL1-1", "KY21.Chr1.754.v1.nonSL2-1", "KY21.Chr1.754.v1.nonSL1-1", "KY21.Chr1.770.v1.ND1-1", "KY21.Chr1.791.v1.SL1-1", "KY21.Chr1.791.v1.nonSL2-1", "KY21.Chr1.791.v1.nonSL3-1", "KY21.Chr1.791.v2.SL1-2", "KY21.Chr1.791.v2.nonSL2-2", "KY21.Chr1.791.v2.nonSL3-2", "KY21.Chr1.791.v3.SL1-3", "KY21.Chr1.791.v3.nonSL2-3", "KY21.Chr1.791.v3.nonSL3-3", "KY21.Chr1.791.v4.SL1-4", "KY21.Chr1.791.v4.nonSL2-4", "KY21.Chr1.791.v4.nonSL3-4", "KY21.Chr1.791.v5.SL1-5", "KY21.Chr1.791.v…
$ annotation   <chr> "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "ribo", "…
# Annotated ribosomes
ciona_TMTproC_rowMean_ribosome.df <-
  ciona_TMTproC_rowMean.df %>%  
  dplyr::select(c(1,4:11)) %>% 
  right_join(ciona_ribosome, by = c("Protein_Id_transcript" = "TranscriptID")) %>% 
  dplyr::select(-GeneID) %>% 
  rename_with(
  .fn = ~str_replace(., "_by_rowMean$", ""),
  .cols = ends_with("_by_rowMean")) %>% 
   filter_at(vars(2:9), all_vars(. <= 2)) # fc less than 2

ciona_TMTproC_rowMean_ribosome.df %>% group_by(annotation) %>%  tally() # 314 ribosome associated proteins ==> 298 with cut off 
# A tibble: 1 × 2
  annotation     n
  <chr>      <int>
1 ribo         298
# write.table("HGNCmitocarta.dynamics.df.txt", quote=F, col.names=T, row.names=F, sep="\t")

###############################
#### Normalising using the median column of stable ribosomes proteins
###############################

#  Median by column
ciona_colMedian_normVector_Ribo <- 
  ciona_TMTproC_rowMean_ribosome.df %>% 
  dplyr::select(2:9) %>% 
  ungroup() %>%
  summarize_all(median)

# divide each column of df by the corresponding column of vector median 
ciona_normRibo <- ciona_TMTproC_rowMean.df #%>% drop_na()
ciona_normRibo[, 4:11] <- mapply('/', ciona_normRibo[, 4:11], ciona_colMedian_normVector_Ribo)
colnames(ciona_normRibo) <- c("Protein_Id_transcript", "Protein_Id_Gene", "Number_of_peptides", "unfE_byRowCol", "fertE_byRowCol", "cell16_byRowCol", "iniG_byRowCol", "latN_byRowCol", "midTII_byRowCol", "latTII_byRowCol", "larva_byRowCol")

###############################
#### Kmeans dynamics of Ciona protein dataset with ribosomes normalisation 
###############################

set.seed(777) # reproduce the clusters
# Fit and predict clusters with k = 8
ciona_kmeans_ribo <- kmeans(ciona_normRibo[,4:11], centers = 8, nstart = 100, iter.max = 1000)

# clean up the clustering and add cluster prediction to the original data set
ciona_results_ribo <- augment(ciona_kmeans_ribo, ciona_normRibo)

# summarizes  per-cluster level info 
ciona_cluster_avgSummary_ribo <- tidy(ciona_kmeans_ribo)
ciona_cluster_avgSummary_ribo
# A tibble: 8 × 11
  unfE_byRowCol fertE_byRowCol cell16_byRowCol iniG_byRowCol latN_byRowCol midTII_byRowCol latTII_byRowCol larva_byRowCol  size withinss cluster
          <dbl>          <dbl>           <dbl>         <dbl>         <dbl>           <dbl>           <dbl>          <dbl> <int>    <dbl> <fct>  
1         2.03          1.97             1.32         0.676         0.593            0.549           0.550          0.548   233     431. 1      
2         0.793         0.784            0.859        0.942         1.01             1.09            1.22           1.28   2023     427. 2      
3         1.06          1.05             1.07         1.03          1.01             0.979           0.949          0.922  3142     570. 3      
4         0.117         0.0732           0.101        0.0764        0.0958           0.364           1.77           5.01    136     159. 4      
5         0.349         0.344            0.473        0.627         1.01             1.43            1.69           1.90    666     499. 5      
6         0.226         0.152            0.169        0.144         0.345            1.09            2.37           3.13    378     407. 6      
7         0.471         0.526            0.838        1.34          1.46             1.37            1.03           0.969   470     440. 7      
8         6.02          0.524            0.143        0.108         0.128            0.171           0.285          0.670    47     116. 8      
# ciona_cluster_avgSummary_ribo %>% filter(cluster == 5) 
ciona_cluster_avgSummary_ribo %>%  
  pivot_longer(cols = c(1:8), names_to = "stage", values_to = "expression") %>% 
  mutate(stage = str_remove(stage, "_byRowCol")) %>% 
  mutate(stage = fct_relevel(stage, ciona_stages), cluster = factor(cluster)) %>% 
  ggplot(aes(stage, expression, color = cluster)) +
  geom_point() +
  geom_line(aes(group = interaction(cluster, size), size = as.factor(size))) +
  scale_color_manual(values = ciona_cluster_palette) +
  theme(legend.position = "bottom", plot.title = element_text(size = 10)) +
  labs(title = "Ribosome normalization", x = "") & theme_pubr(legend = "bottom",base_family = "Avenir")

  # scale_size_discrete(name = "Cluster size") + # rename the size legend
  # guides(color = guide_none())

1.0.3 Comparing normalization process

###############################
#### cluster plots
###############################

plot_std_norm <- plot_cluster_norm(ciona_cluster_avgSummary, "Before good normalization") + scale_color_manual(values = c("1" = "#AB3329FF", "2" = "#88A0DCFF", "3" = "#ED968CFF","4" = "#381A61FF", "5" = "#E78429FF", "6" = "#7FA074FF","7" = "#7C4B73FF", "8" = "#F9D14AFF"))
plot_mito_norm <- plot_cluster_norm(ciona_cluster_avg_summary_mito, "Mitocarta normalization") 
plot_ribo_norm <- plot_cluster_norm(ciona_cluster_avgSummary_ribo, "Ribosome normalization") +
  scale_color_manual(values = c("1" = "#7FA074FF", "2" = "#7C4B73FF", "3" = "#88A0DCFF","4" = "#E78429FF", "5" = "#381A61FF", "6" = "#ED968CFF","7" = "#F9D14AFF", "8" = "#AB3329FF")) 

top <-  plot_std_norm + plot_mito_norm + plot_ribo_norm

###############################
#### data pivot and linear regression
###############################

# function for cluster plots
ciona_results_std_pivot <- cluster_pivot_data(ciona_results, 2, c(1,4:11))
ciona_results_mito_pivot <- cluster_pivot_data(ciona_results_mito, 1, c(1,6:13))
ciona_results_ribo_pivot <- cluster_pivot_data(ciona_results_ribo, 3, c(1,4:11))

# Plot linear regression for each normalization method
bottom <- 
  plot_cluster_lr(ciona_results_std_pivot, "Standard normalisaton", FALSE, "B1") + 
  plot_cluster_lr(ciona_results_mito_pivot, "Mitocarta normalisaton", FALSE,"B2") +
  plot_cluster_lr(ciona_results_ribo_pivot, "Ribosome normalisaton", FALSE, "B3") & theme_pubr(base_family = "Avenir")

top / bottom + plot_annotation(title = 'Normalization comparison') + plot_annotation(tag_levels = 'A') &
  theme(text = element_text('Avenir'), plot.title = element_text(size = 30), plot.tag = element_text(size = 15)) 

1.0.4 Relative distribution

Counts per protein scaled to integrate to 1. This is useful, to compare the proteome between species.

###############################
#### reshaping the data to span 0-1 and/or to the number of multiplext TMT channels (8 for ciona; 10 in THao et al.;11 in Itallie et al.)
###############################

# Sum of channels/ stages is 8 (8 channels - 8 development stages)
ciona_proteome <- 
  ciona_TMTproC_rowMeancolMedian_flat_goodNorm.df %>%
  rename_with(~ str_remove(., "_byRowCol")) %>%
  rename("protein_id_transcript" = "Protein_Id_transcript", "protein_id_gene" = "Protein_Id_Gene", "human_id" = "HumanID", "hgnc" = "HGNC", "number_of_peptides" = "Number_of_peptides") |> 
  ungroup()

ciona_proteome_normChan_tidy_wide <- 
  ciona_TMTproC_rowMeancolMedian_flat_goodNorm.df %>%
  rename_with(~ str_remove(., "_byRowCol")) %>%
  rename("protein_id_transcript" = "Protein_Id_transcript", "protein_id_gene" = "Protein_Id_Gene", "human_id" = "HumanID", "hgnc" = "HGNC", "number_of_peptides" = "Number_of_peptides") |> 
  rowwise() %>%
  mutate(across(c(6:13), ~ .x / sum(c_across(6:13))) * 8) %>%
  ungroup() 
# mutate(row_sum = rowSums(across(c(6:13)), na.rm = TRUE))  # checking rowSum protein is 1

# Sum of channels/stages is 1
ciona_proteome_normTo1_tidy_wide <- ciona_proteome_normChan_tidy_wide |> 
    # dplyr::select(-number_of_peptides) |> 
    mutate(across(c(6:13), ~./8 ))
    # mutate(row_sum = rowSums(across(c(6:13)), na.rm = TRUE))  # checking rowSum protein is 1

write.table(ciona_proteome_normTo1_tidy_wide, paste0(directory_save, "/ciona_proteome_normTo1.tsv"), quote = F, col.names = T, row.names = F, sep = "\t")
write.table(ciona_proteome_normChan_tidy_wide, paste0(directory_save, "/ciona_proteome_normToChan.tsv"), quote = F, col.names = T, row.names = F, sep = "\t")

Checking that the pattern is the same

# Picking the protein Thr
a1 <- ciona_proteome |> 
  filter(human_id =="THRB") |>   
  pivot_longer(cols = unfE:larva, names_to = "stage", values_to = "value") |> 
  mutate(stage = factor(stage, levels = c("unfE", "fertE", "cell16", "iniG", "latN", "midTII", "latTII", "larva"))) |> 
  ggplot(aes(x = stage, y = value)) +
  geom_point() +
  labs(title = "Scatterplot of Protein Expression Across Stages",
       x = "Stage",
       y = "Expression Level") +
  theme_pubr(base_family = "Avenir") 

a2 <- ciona_proteome_normChan_tidy_wide |> 
  filter(human_id =="THRB") |>   
  pivot_longer(cols = unfE:larva, names_to = "stage", values_to = "value") |> 
  mutate(stage = factor(stage, levels = c("unfE", "fertE", "cell16", "iniG", "latN", "midTII", "latTII", "larva"))) |> 
  ggplot(aes(x = stage, y = value)) +
  geom_point() +
  labs(title = "Scatterplot of Protein Expression Across Stages",
       x = "Stage",
       y = "Expression Level") +
  theme_pubr(base_family = "Avenir")

a3 <- ciona_proteome_normTo1_tidy_wide |> 
  filter(human_id =="THRB") |>   
  pivot_longer(cols = unfE:larva, names_to = "stage", values_to = "value") |> 
  mutate(stage = factor(stage, levels = c("unfE", "fertE", "cell16", "iniG", "latN", "midTII", "latTII", "larva"))) |> 
  ggplot(aes(x = stage, y = value)) +
  geom_point() +
  labs(title = "Scatterplot of Protein Expression Across Stages",
       x = "Stage",
       y = "Expression Level") +
  theme_pubr(base_family = "Avenir") 


a1 / a2 /a3 

# & coord_fixed(ratio = 1)
# a1 / a2 /a3 & coord_equal()